3. Train-Predict-XGBoost

In [46]:
import time
import os
import pandas as pd

project_name = 'Dog_Breed_Identification'
step_name = 'Train-Predict-XGBoost'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)

cwd = os.getcwd()
log_path = os.path.join(cwd, 'log')
model_path = os.path.join(cwd, 'model')
output_path = os.path.join(cwd, 'output')
print('log_path: \t' + log_path)
print('model_path: \t' + model_path)
print('output_path: \t' + output_path)


run_name: Dog_Breed_Identification_Train-Predict-XGBoost_20171101_221638
log_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\log
model_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\model
output_path: 	E:\Udacity\MachineLearning(Advanced)\p6_graduation_project\output

In [27]:
df = pd.read_csv(os.path.join(cwd, 'input', 'labels.csv'))
print('labels amount: %d' % len(df))
df.head()


labels amount: 10222
Out[27]:
                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever

In [28]:
import h5py
import numpy as np
from sklearn.utils import shuffle
np.random.seed(2017)

x_train = []
x_test = []

cwd = os.getcwd()
feature_vgg16 = os.path.join(cwd, 'model', 'feature_VGG16_{}.h5'.format(20171026))
feature_vgg19 = os.path.join(cwd, 'model', 'feature_VGG19_{}.h5'.format(20171026))
feature_resnet50 = os.path.join(cwd, 'model', 'feature_ResNet50_{}.h5'.format(20171026))
feature_xception = os.path.join(cwd, 'model', 'feature_Xception_{}.h5'.format(20171026))
feature_inception = os.path.join(cwd, 'model', 'feature_InceptionV3_{}.h5'.format(20171026))
# feature_inceptionResNetV2 = os.path.join(cwd, 'model', 'feature_InceptionResNetV2_{}.h5'.format(20171028))
for filename in [feature_vgg16, feature_vgg19, feature_resnet50, feature_xception, feature_inception]:
    with h5py.File(filename, 'r') as h:
        x_train.append(np.array(h['train']))
        # every feature file stores the same label vector, so overwriting is safe
        y_train = np.array(h['train_labels'])
        x_test.append(np.array(h['test']))

# concatenate the per-network features along the feature axis
x_train = np.concatenate(x_train, axis=-1)
x_test = np.concatenate(x_test, axis=-1)
print(x_train.shape)
print(x_train.shape[1:])

print(len(y_train))
print(x_test.shape)


(10222, 7168)
(7168,)
10222
(10357, 7168)
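
The 7168-wide feature vector is the concatenation of the five bottleneck outputs (512 + 512 + 2048 + 2048 + 2048 for VGG16, VGG19, ResNet50, Xception and InceptionV3 with global average pooling). A minimal sketch of how one such feature file could have been produced in the earlier extraction step; the image arrays x_train_images, y_labels and x_test_images are assumed from that step and are not defined in this notebook:

In [ ]:
# Hedged sketch, not the original extraction code.
# import h5py
# from keras.applications.resnet50 import ResNet50
#
# base = ResNet50(weights='imagenet', include_top=False, pooling='avg')
# train_feat = base.predict(x_train_images, batch_size=32)  # -> (n_train, 2048)
# test_feat = base.predict(x_test_images, batch_size=32)    # -> (n_test, 2048)
# with h5py.File('feature_ResNet50_20171026.h5', 'w') as h:
#     h.create_dataset('train', data=train_feat)
#     h.create_dataset('train_labels', data=y_labels)
#     h.create_dataset('test', data=test_feat)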

In [29]:
from sklearn.utils import shuffle
(x_train, y_train) = shuffle(x_train, y_train)

In [30]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.05, random_state=2017)
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)


(9710, 7168)
(9710,)
(512, 7168)
(512,)

In [31]:
from keras.utils.np_utils import to_categorical

# XGBoost's multi-class objectives expect integer class labels rather than
# one-hot vectors, so these conversions stay commented out
# y_train = to_categorical(y_train)
# y_val = to_categorical(y_val)
print(y_train.shape)
print(y_val.shape)


(9710,)
(512,)

Build Model


In [32]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [43]:
%%time
xg_train = xgb.DMatrix(x_train, label=y_train)
xg_val = xgb.DMatrix(x_val, label=y_val)
xg_test = xgb.DMatrix(x_test)
# set up parameters for xgboost
param = {}
# use softmax multi-class classification (outputs hard class labels)
param['objective'] = 'multi:softmax'
# learning rate (shrinkage)
param['eta'] = 0.1
# maximum tree depth; 50 is far deeper than the usual 3-10 range,
# which is largely why each round below takes minutes
param['max_depth'] = 50
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 120

watchlist = [(xg_train, 'train'), (xg_val, 'val')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist)


[0]	train-merror:0.252935	val-merror:0.361328
[1]	train-merror:0.14861	val-merror:0.298828
[2]	train-merror:0.106076	val-merror:0.257812
[3]	train-merror:0.075695	val-merror:0.242188
[4]	train-merror:0.054892	val-merror:0.226562
Wall time: 14min 47s
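
With eta at 0.1 and only five rounds, the validation error is still falling at the last round. A hedged variant lets xgboost pick the round count via early stopping; num_boost_round and early_stopping_rounds here are illustrative, not tuned, and a run this long would take hours at this tree depth:

In [ ]:
# Hedged sketch: early stopping on the validation split.
# bst_es = xgb.train(param, xg_train, num_boost_round=200,
#                    evals=watchlist, early_stopping_rounds=10)
# print(bst_es.best_iteration, bst_es.best_score)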

In [44]:
model_name = run_name + '.bin'
bst.save_model(model_name)

In [45]:
bst0 = xgb.Booster({'nthread': 4})  # init model
bst0.load_model(model_name)  # load the saved model

In [49]:
y_pred = bst0.predict(xg_val)
print(y_pred.shape)
print(y_pred[0:5])


(512,)
[  0.   5.  14.  94.   2.]
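
multi:softmax returns hard class indices (as floats), so validation accuracy follows directly with the accuracy_score imported above; this should agree with 1 minus the final val-merror:

In [ ]:
val_acc = accuracy_score(y_val, y_pred.astype(int))
print('val accuracy: %.4f' % val_acc)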

In [50]:
# train again with the same parameters, but output per-class probabilities
param['objective'] = 'multi:softprob'
bst1 = xgb.train(param, xg_train, num_round, watchlist)
# Note: older xgboost builds returned softprob predictions as a flat 1D array
# that had to be reshaped to (ndata, nclass); this build already returns a 2D
# array, as the (512, 120) shape below shows.


[0]	train-merror:0.252935	val-merror:0.361328
[1]	train-merror:0.14861	val-merror:0.298828
[2]	train-merror:0.106076	val-merror:0.257812
[3]	train-merror:0.075695	val-merror:0.242188
[4]	train-merror:0.054892	val-merror:0.226562
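
Since the softprob predictions come back as a 2D array here, the per-class probabilities can be collapsed with argmax for a quick error check:

In [ ]:
pred_prob = bst1.predict(xg_val)           # shape (n_val, 120)
pred_label = np.argmax(pred_prob, axis=1)  # hard class per row
error_rate = np.mean(pred_label != y_val)
print('val error using softprob = {:.4f}'.format(error_rate))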

In [52]:
model_name = run_name + '_prob.bin'
bst1.save_model(model_name)

In [53]:
bst0 = xgb.Booster({'nthread': 4})  # init model
bst0.load_model(model_name)  # load the saved softprob model

In [51]:
y_pred = bst1.predict(xg_val)
print(y_pred.shape)
print(y_pred[0:5])


(512, 120)
[[ 0.2974295   0.00490525  0.00490315  0.00490537  0.00500645  0.00490574
   0.0049357   0.00490251  0.00490329  0.00490563  0.00490444  0.00490323
   0.00494976  0.00490511  0.00490492  0.00490526  0.0049048   0.00496223
   0.00490462  0.00490526  0.00490454  0.00490512  0.00577949  0.00490558
   0.00490477  0.00490458  0.0071401   0.00490558  0.00490444  0.00490545
   0.00502031  0.00490284  0.0049055   0.00490569  0.00490459  0.00563949
   0.00490418  0.00490501  0.00555472  0.00490506  0.00490551  0.00500069
   0.00490202  0.00490547  0.00490484  0.00490522  0.00490505  0.00528597
   0.00490526  0.00490467  0.00490415  0.00492774  0.00490459  0.00490551
   0.00662434  0.00490357  0.00490445  0.00490508  0.00589457  0.00490437
   0.00490517  0.00490458  0.00490286  0.00490502  0.04553157  0.00490419
   0.00490526  0.00490534  0.00490429  0.00490533  0.0049056   0.00490531
   0.00490403  0.0049024   0.00490438  0.00490484  0.00490572  0.00621409
   0.06089636  0.00490542  0.00490123  0.00490466  0.00490556  0.00490487
   0.00490427  0.0049045   0.0049055   0.00490346  0.00490342  0.00490521
   0.00490474  0.00490423  0.00490513  0.00490549  0.00528533  0.00490312
   0.01256359  0.00490426  0.00490406  0.00490497  0.00569648  0.00490533
   0.00490544  0.0049052   0.00490549  0.00496761  0.00494197  0.00490408
   0.00490494  0.00490511  0.00711655  0.00567312  0.00490527  0.00490543
   0.00490357  0.00528359  0.00536904  0.0049822   0.00490476  0.00500313]
 [ 0.00664251  0.00664275  0.0066399   0.00664292  0.00715593  0.13026524
   0.00664294  0.00663904  0.00664009  0.00664326  0.00664165  0.00975148
   0.00681564  0.00664256  0.007697    0.00664276  0.00664214  0.01204315
   0.00664189  0.00680158  0.00664179  0.00664257  0.00664207  0.00664319
   0.0066421   0.00664184  0.00664245  0.00933286  0.00664166  0.00854256
   0.00663859  0.00663948  0.00664309  0.02280047  0.00664186  0.00673126
   0.0066413   0.00664242  0.00735943  0.00664249  0.00664311  0.00664071
   0.0107274   0.00664305  0.0066422   0.0066427   0.01167289  0.00664258
   0.00664276  0.00751382  0.00664126  0.00664255  0.00664185  0.00727148
   0.00664281  0.00664047  0.00664166  0.00664252  0.00663977  0.00664155
   0.00664264  0.00664184  0.00663951  0.00963491  0.00664108  0.00664132
   0.00664277  0.00703968  0.00670006  0.00667426  0.00664322  0.00664283
   0.0066411   0.00663888  0.00664157  0.02073119  0.00664338  0.00667883
   0.01736585  0.00715883  0.00663731  0.00664195  0.00664317  0.00664223
   0.00675325  0.00664173  0.00664308  0.00664033  0.00664027  0.0066427
   0.00670577  0.00664136  0.00664258  0.00769393  0.00664221  0.00663986
   0.00664279  0.00664141  0.00664113  0.00664237  0.00664212  0.00664286
   0.00679436  0.00664268  0.00664307  0.00664133  0.00664307  0.00664115
   0.01184149  0.00664256  0.0066424   0.00664157  0.00664277  0.00664299
   0.00787519  0.00664094  0.00664299  0.00664273  0.00668845  0.00669705]
 [ 0.0051162   0.00511637  0.00511418  0.0051165   0.00511602  0.00511689
   0.00511652  0.00511352  0.05320547  0.00511677  0.00511553  0.00511426
   0.02889798  0.00511623  0.30715349  0.00511638  0.00511591  0.00511535
   0.00511571  0.00511639  0.00511564  0.00511624  0.00511585  0.00511672
   0.00511587  0.005234    0.00511615  0.00511672  0.00511553  0.00511659
   0.00511317  0.00741165  0.00511664  0.00511683  0.00511569  0.00511491
   0.00511526  0.00511613  0.00517258  0.00511617  0.00511665  0.00556918
   0.00511301  0.00511661  0.00511595  0.00511634  0.00640045  0.00511625
   0.00511638  0.00511577  0.00511523  0.00511622  0.00511569  0.00511665
   0.00511643  0.00511462  0.00511554  0.0051162   0.00511408  0.00511545
   0.00511629  0.00511567  0.00511388  0.00511613  0.00511509  0.00511527
   0.00511639  0.00511647  0.00780102  0.00511646  0.00527611  0.00511644
   0.00578488  0.0051134   0.00511547  0.00511595  0.00511686  0.00511676
   0.00511666  0.00511656  0.00511218  0.00511576  0.0051167   0.00533481
   0.00511535  0.00511559  0.00511663  0.00511451  0.00511447  0.00525322
   0.00511584  0.00511531  0.00511625  0.00627326  0.00511596  0.00511415
   0.0051164   0.00511535  0.00511513  0.00511609  0.00522371  0.00511646
   0.00511657  0.00511632  0.00551276  0.00511529  0.00571635  0.00679448
   0.00521185  0.00519542  0.00511611  0.00511547  0.00511639  0.00511656
   0.00511463  0.00511498  0.00511656  0.00511636  0.00511586  0.00511636]
 [ 0.00502803  0.00499871  0.00499657  0.00499884  0.00499836  0.00499921
   0.00499885  0.00499592  0.00499671  0.0049991   0.00499788  0.00499665
   0.00499837  0.00499857  0.00499838  0.00499872  0.00504145  0.00499771
   0.00499807  0.00499872  0.00499799  0.00499857  0.0049982   0.00499904
   0.00499822  0.00499803  0.00499849  0.00508885  0.00499789  0.00499892
   0.02519783  0.00499625  0.00499897  0.0052715   0.00499804  0.00499727
   0.00499762  0.00499847  0.00499857  0.00538681  0.00499898  0.00499717
   0.00499542  0.00499894  0.0049983   0.00499868  0.00499851  0.00499859
   0.00499872  0.00499812  0.00499759  0.01606635  0.00499804  0.00499898
   0.00499876  0.00545304  0.00499789  0.00565864  0.00499647  0.00499781
   0.00499863  0.00499803  0.00672605  0.00499847  0.00499746  0.00499763
   0.00499872  0.00499881  0.00504184  0.00499879  0.00499907  0.00499878
   0.00499747  0.00506755  0.00499782  0.00510018  0.0050289   0.00499909
   0.00499899  0.00499889  0.00499462  0.00499811  0.00499903  0.00499832
   0.00514893  0.00499795  0.00499896  0.00499689  0.00499685  0.00499867
   0.00499819  0.00499767  0.00499859  0.00499896  0.36800078  0.00614943
   0.00499874  0.0049977   0.00499749  0.00499843  0.00499824  0.00499879
   0.0049989   0.00499866  0.00499896  0.00566063  0.00499896  0.00499751
   0.00506728  0.00499857  0.00499845  0.00499782  0.00499873  0.00499889
   0.004997    0.00499735  0.00499889  0.0049987   0.00499821  0.00499869]
 [ 0.00524388  0.00524406  0.35276937  0.0052442   0.0052437   0.00524459
   0.00543621  0.00524114  0.00524197  0.00524447  0.0052432   0.0052419
   0.00524371  0.00524391  0.00524371  0.00524407  0.00524358  0.00524302
   0.00565103  0.00544742  0.00524331  0.00524392  0.00524353  0.00524441
   0.00524355  0.00524335  0.00532671  0.00524442  0.0052432   0.00524428
   0.00524078  0.00524149  0.00524433  0.00524453  0.00524336  0.00531893
   0.00529505  0.0062623   0.00524392  0.00524386  0.00524435  0.00524245
   0.00524061  0.0052443   0.00524363  0.01015866  0.00524385  0.00524394
   0.00524407  0.00524345  0.00529815  0.0052681   0.0056493   0.00524435
   0.00524412  0.00524227  0.00571694  0.00524388  0.0056489   0.00524312
   0.0145527   0.00530208  0.00524151  0.00524382  0.00524275  0.00524293
   0.00524408  0.00524417  0.00524303  0.00524415  0.00524444  0.00524413
   0.00524276  0.00524101  0.00524313  0.00524363  0.00524456  0.00524446
   0.00524435  0.00524425  0.00523977  0.00524344  0.0052444   0.00643719
   0.00524301  0.00524326  0.00533296  0.00524216  0.00524211  0.00524402
   0.00593787  0.00761592  0.00524393  0.00593885  0.00524364  0.00524179
   0.00524409  0.00524301  0.00524279  0.00524377  0.00524357  0.00524415
   0.00524427  0.00524401  0.00524432  0.00524295  0.00537048  0.00524281
   0.00524373  0.00524392  0.00524379  0.00524313  0.00524408  0.00524426
   0.00524227  0.00524264  0.00524426  0.00524405  0.00524354  0.00565101]]
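
Kaggle scores this competition with multiclass log loss, so it is worth checking that metric on the held-out split as well; labels is passed explicitly since 512 validation samples may not cover all 120 classes:

In [ ]:
from sklearn.metrics import log_loss
print('val log loss: %.4f' % log_loss(y_val, y_pred, labels=list(range(120))))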

In [ ]:
# derive final_acc from the validation split so the run name encodes it
final_acc = accuracy_score(y_val, np.argmax(bst1.predict(xg_val), axis=1))
run_name0 = run_name + '_' + str(int(final_acc*10000)).zfill(4)

Predict


In [ ]:
# Used to load a previously saved model and skip training.
# This Keras snippet is left over from the CNN notebooks; the XGBoost
# equivalent is the xgb.Booster() / load_model() pair used above.
# import os
# from keras.models import load_model
# cwd = os.getcwd()
# model = load_model(os.path.join(cwd, 'model', 'Dog_Breed_Identification_Train_20171024_155154.h5'))

In [ ]:
y_pred = bst1.predict(xg_test)  # per-class probabilities for the test set
print(y_pred.shape)

In [ ]:
# print(y_pred[:10])
# y_pred = np.clip(y_pred, 0.005, 0.995)
# print(y_pred[:10])
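
Clipping matters because the metric is multiclass log loss: a confidently wrong prediction costs -ln(p), which is unbounded as p approaches 0, while clipping into [0.005, 0.995] caps the per-example penalty at -ln(0.005) ≈ 5.3.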

In [ ]:
files = os.listdir(os.path.join(cwd, 'input', 'data_test', 'test'))
print(files[:10])

In [ ]:
cwd = os.getcwd()
df = pd.read_csv(os.path.join(cwd, 'input', 'labels.csv'))
print('labels amount: %d' % len(df))
df.head()

In [ ]:
n = len(df)
breed = set(df['breed'])
n_class = len(breed)
# Note: iterating a set gives no guaranteed order; this mapping is only valid
# if it matches the encoding used when the feature labels were built.
class_to_num = dict(zip(breed, range(n_class)))
num_to_class = dict(zip(range(n_class), breed))
print(breed)

In [ ]:
df2 = pd.read_csv(os.path.join(cwd, 'input', 'sample_submission.csv'))
n_test = len(df2)
print(df2.shape)
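
The rows of y_pred must line up with df2['id']. Assuming the test features were extracted in the order of the directory listing above (an assumption, not verified here), a hedged consistency check:

In [ ]:
# Hedged sanity check: prediction order vs submission order.
# test_ids = [os.path.splitext(f)[0] for f in files]
# assert list(df2['id']) == test_ids, 'prediction rows do not match submission ids'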

In [ ]:
# columns 1..120 of the sample submission are assumed to follow the same
# order as the numeric class encoding
for i in range(0, 120):
    df2.iloc[:, [i + 1]] = y_pred[:, i]
if not os.path.exists(output_path):
    os.mkdir(output_path)
pred_file = os.path.join(output_path, 'pred_' + run_name0 + '.csv')
df2.to_csv(pred_file, index=None)

In [ ]:
print(run_name0)
print('Done !')
